# Standard-library / third-party imports for the gradient-descent demo notebook.
import time
from collections import OrderedDict
from itertools import product

import numpy as np
# BUG FIX: `norm` was never a public member of scipy.spatial.distance (it was
# an internal re-export of numpy.linalg.norm and is gone in modern SciPy).
# Import it from its canonical home instead.
from numpy.linalg import norm
from sklearn.datasets import make_regression

from plotly.graph_objs import *
import plotly.tools as tls
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Inject plotly.js so iplot() renders figures inline in the notebook.
init_notebook_mode()
def gradient_descent(X, y,
                     cost_function, gradient_of_cost_function,
                     initial_guess, learning_rate=.1,
                     threshold=1e-3, max_iter=1e3):
    """Minimize cost_function with full-batch gradient descent.

    Parameters
    ----------
    X, y : ndarray
        Design matrix and targets, passed through to the callbacks.
    cost_function : callable(X, y, params) -> float
    gradient_of_cost_function : callable(X, y, params) -> ndarray
    initial_guess : ndarray of shape (2,)
        Starting parameters.  NOT mutated: a float copy is taken.
    learning_rate : float
        Step-size multiplier for each gradient step.
    threshold : float
        Stop once the norm of the full gradient falls below this value.
    max_iter : number
        Hard cap on the number of iterations.

    Returns
    -------
    (param_history, time_history)
        param_history: list of (beta0, beta1, cost) per step (including the
        starting point); time_history: wall-clock timestamp of each step,
        for convergence-vs-time plots.
    """
    # BUG FIX: the original did `params = initial_guess` and then updated with
    # `params -= ...`, mutating the caller's array in place.  Work on a float
    # copy so initial_guess is left untouched.
    params = np.asarray(initial_guess, dtype=float).copy()
    param_history = [(params[0], params[1], cost_function(X, y, params))]
    time_history = [time.time()]
    iterations = 0
    # NOTE: dead `improvement` / `old_cost` bookkeeping from the original was
    # removed -- it was computed every step but never read.
    while (norm(gradient_of_cost_function(X, y, params)) >= threshold
           and iterations < max_iter):
        iterations += 1
        params -= learning_rate * gradient_of_cost_function(X, y, params)
        cost = cost_function(X, y, params)
        param_history.append((params[0], params[1], cost))  # for plotting
        time_history.append(time.time())
    if iterations == max_iter:
        print("max iterations reached")
    print("Final gradient of cost function %s" % gradient_of_cost_function(X, y, params))
    print("Final params %s" % params)
    return param_history, time_history
def stochastic_gradient_descent(X, y,
                                cost_function, gradient_of_cost_function,
                                initial_guess, learning_rate=.1,
                                threshold=1e-3, max_iter=1e3, batch_size=1):
    """Minimize cost_function with (mini-batch) stochastic gradient descent.

    Mini-batches are deterministic: batch_size consecutive rows, cycling
    through the data set with wrap-around (no shuffling).

    Parameters
    ----------
    X, y : ndarray
        Design matrix and targets.
    cost_function : callable(X, y, params) -> float
        Kept for interface parity with gradient_descent (costs are not
        recorded here; the SGD history stores only the parameters).
    gradient_of_cost_function : callable(X, y, params) -> ndarray
    initial_guess : ndarray of shape (2,)
        Starting parameters.  NOT mutated: a float copy is taken.
    learning_rate, threshold, max_iter : as in gradient_descent.
        NOTE: the stopping test uses the FULL gradient (matching the
        original), keeping the criterion comparable with batch GD.
    batch_size : int
        Rows per update step; clamped to the data-set size.

    Returns
    -------
    (param_history, time_history)
        param_history: list of (beta0, beta1) per step; time_history:
        wall-clock timestamp of each step.
    """
    n_observations = X.shape[0]
    batch_size = min(batch_size, n_observations)
    # BUG FIX: copy to avoid mutating the caller's initial_guess in place.
    params = np.asarray(initial_guess, dtype=float).copy()
    param_history = [(params[0], params[1])]
    time_history = [time.time()]
    iterations = 0
    # NOTE: dead `improvement` / `old_cost` bookkeeping (and the per-batch
    # cost_function call that fed it) was removed -- it was never read.
    while (norm(gradient_of_cost_function(X, y, params)) >= threshold
           and iterations < max_iter):
        # Deterministic cyclic mini-batch: batch_size consecutive indices
        # starting where the previous batch left off, wrapping around.
        start = batch_size * iterations % n_observations
        indices = [(start + k) % n_observations for k in range(batch_size)]
        Xi, yi = X[indices], y[indices]
        # Update parameters using the mini-batch gradient only.
        params -= learning_rate * gradient_of_cost_function(Xi, yi, params)
        param_history.append((params[0], params[1]))  # for plotting
        iterations += 1
        time_history.append(time.time())
    if iterations == max_iter:
        print("max iterations reached")
    print("Final gradient of cost function %s" % gradient_of_cost_function(X, y, params))
    print("Final params %s" % params)
    return param_history, time_history
def plot_results(X, y, cost_function, param_history):
    """Build a 3-D plotly figure: the cost surface around the optimum plus
    the optimizer's path.

    param_history is a list of (beta0, beta1, cost) tuples; the surface grid
    is built by reflecting every visited coordinate through the final one.
    """
    final = param_history[-1][0:2]

    def _mirrored_axis(k):
        # Every visited value plus its mirror image across the final value,
        # sorted ascending.  Same arithmetic as the original so the exact
        # floating-point grid is reproduced.
        below = [final[k] - (final[k] - p[k]) for p in param_history]
        above = [final[k] + (final[k] - p[k]) for p in param_history]
        return np.sort(np.array(below + above))

    x_params = _mirrored_axis(0)
    y_params = _mirrored_axis(1)

    # product() varies y fastest, so each row of the reshaped grid is one x.
    grid = list(product(x_params, y_params))
    costs = np.reshape(
        [cost_function(X, y, np.array([gx, gy])) for gx, gy in grid],
        (len(x_params), -1))

    cost_surface = Surface(
        x=x_params,
        y=y_params,
        z=costs,
        colorscale=[[0, 'rgb(31,119,180)'],
                    [0.5, 'rgb(143, 123, 196)'],
                    [1, 'rgb(255,127,97)']],
        name='Cost Function')

    # NOTE(review): x/y hold 2*len(param_history) sorted grid values while z
    # holds len(param_history) costs -- the lengths differ, so this trace is
    # probably not the intended path.  Preserved as-is from the original.
    path_trace = Scatter3d(
        x=x_params,
        y=y_params,
        z=[p[2] for p in param_history],
        mode='lines+markers')

    layout = Layout(
        scene=dict(
            xaxis=dict(title='beta0'),
            yaxis=dict(title='beta1'),
            zaxis=dict(title='cost')))
    return Figure(data=Data([cost_surface, path_trace]), layout=layout)
def plot_sgd_results(X, y, cost_function, param_history):
    """Build a 3-D plotly figure: the cost surface over the region SGD
    visited, with the parameter path snapped onto the surface's grid.

    param_history is a list of (beta0, beta1) pairs (unlike the batch-GD
    history it carries no cost column); path costs are recomputed here.
    """
    x_history = [p[0] for p in param_history]
    y_history = [p[1] for p in param_history]
    x_params = np.linspace(min(x_history), max(x_history), 100)
    y_params = np.linspace(min(y_history), max(y_history), 100)
    samples = list(product(x_params, y_params))
    sample_grid = np.asarray(samples)  # shape (100*100, 2)

    # Snap every visited point to its nearest grid sample so the path lies
    # exactly on the rendered surface.  The OrderedDict de-duplicates
    # repeated parameter values while preserving visit order.
    # PERF: vectorized argmin -- the original scanned all 10,000 samples per
    # history point in a Python loop.  np.argmin returns the first minimum,
    # matching the original's strict `<` tie-breaking.
    demo_points = OrderedDict()
    for p in param_history:
        deltas = sample_grid - np.array([p[0], p[1]])
        nearest = int(np.argmin((deltas ** 2).sum(axis=1)))
        demo_points[tuple(p)] = samples[nearest]

    costs = [cost_function(X, y, np.array([s[0], s[1]])) for s in samples]
    # product() varies y fastest, so each row of the reshaped grid is one x.
    costs = np.reshape(costs, (len(x_params), -1))

    cost_surface = Surface(
        x=x_params,
        y=y_params,
        z=costs,
        colorscale=[[0, 'rgb(31,119,180)'],
                    [0.5, 'rgb(143, 123, 196)'],
                    [1, 'rgb(255,127,97)']],
        name='Cost Function')

    # BUG FIX: the original iterated over demo_points itself (its KEYS, the
    # raw history points), silently discarding the snapped grid samples it
    # had just computed.  Use the values so the plotted path actually sits
    # on the surface grid.
    snapped = list(demo_points.values())
    path_trace = Scatter3d(
        x=[s[0] for s in snapped],
        y=[s[1] for s in snapped],
        z=[cost_function(X, y, np.array(s)) for s in snapped],
        mode='lines+markers')

    layout = Layout(
        scene=dict(
            xaxis=dict(title='beta0'),
            yaxis=dict(title='beta1'),
            zaxis=dict(title='cost')))
    return Figure(data=Data([cost_surface, path_trace]), layout=layout)
def predict(X, params):
    """Linear-model prediction: the matrix-vector product X . params."""
    return X.dot(params)
# Synthetic regression data: 100k samples, 2 informative features, noise=10.
X, y = make_regression(n_samples=int(1e5), n_features=2, n_informative=2,
                       random_state=0, noise=10)
# BUG FIX: standardize each feature column by its OWN standard deviation.
# The original centred per column (mean(axis=0)) but divided by the global
# scalar X.std(), leaving the columns unequally scaled.
X = (X - X.mean(axis=0)) / X.std(axis=0)

# Inline 3-D scatter of the raw data set.
data = Scatter3d(
    x=X[:, 0],
    y=X[:, 1],
    z=y,
    mode='markers',
    marker={'size': 1}
)
layout = Layout(
    scene=dict(
        xaxis=dict(title='x0'),
        yaxis=dict(title='x1'),
        zaxis=dict(title='y')
    )
)
figure_ = Figure(data=[data], layout=layout)
iplot(figure_)
# LINEAR REGRESSION WITHOUT REGULARIZATION
def ols_cost_function(X, y, params):
    """Ordinary-least-squares cost: mean squared residual divided by 2."""
    residuals = predict(X, params) - y
    return (residuals ** 2).sum() / (2 * X.shape[0])
def ols_gradient_of_cost_function(X, y, params):
    """Gradient of the OLS cost with respect to params."""
    residuals = predict(X, params) - y
    return residuals.dot(X) / X.shape[0]
# Run batch gradient descent on the unregularized OLS objective, then plot
# the optimization path over the cost surface.
gd_param_history, gd_time_history = gradient_descent(X, y, ols_cost_function, ols_gradient_of_cost_function,
                                                     initial_guess = np.array([0., 0.]))
figure_3d = plot_results(X, y, ols_cost_function, gd_param_history)
iplot(figure_3d)
# LINEAR REGRESSION WITH L2 REGULARIZATION
LAMBDA_ = 10000.
def ridge_cost_function(X, y, params, lambda_=LAMBDA_):
    """Ridge objective: OLS cost plus an L2 penalty of lambda_ * ||params||^2,
    both divided by 2n."""
    residuals = predict(X, params) - y
    penalty = lambda_ * (params ** 2).sum()
    return ((residuals ** 2).sum() + penalty) / (2 * X.shape[0])
def ridge_gradient_of_cost_function(X, y, params, lambda_=LAMBDA_):
    """Gradient of the ridge objective with respect to params."""
    grad_ols = (predict(X, params) - y).dot(X)
    return (grad_ols + lambda_ * params) / X.shape[0]
# Run batch gradient descent on the ridge objective and plot the path.
ridge_param_history, ridge_time_history = gradient_descent(X, y, ridge_cost_function, ridge_gradient_of_cost_function,
                                                           initial_guess = np.array([0., 0.]))
figure_3d = plot_results(X, y, ridge_cost_function, ridge_param_history)
iplot(figure_3d)
# LASSO
PARAM_HISTORY = []
def lasso_cost_function(X, y, params, lambda_=500.):
    """Lasso objective: OLS cost plus an L1 penalty of lambda_ * ||params||_1,
    both divided by 2n."""
    residuals = predict(X, params) - y
    penalty = lambda_ * np.abs(params).sum()
    return ((residuals ** 2).sum() + penalty) / (2 * X.shape[0])
# Sample the lasso cost on a fixed grid and render only the surface (no
# optimization path); the L1 penalty produces a crease along the axes.
x_params = np.linspace(-5, 200, 100)
y_params = np.linspace(-5, 100, 100)
samples = list(product(x_params, y_params))
costs = [lasso_cost_function(X, y, np.array([p[0], p[1]])) for p in samples]
# product() varies y fastest, so each row of the reshaped array is one x value
costs = np.reshape(costs, (len(x_params), -1))
cost_surface = Surface(
    z = costs,
    x = x_params,
    y = y_params,
    colorscale = [[0, 'rgb(31,119,180)'],
                  [0.5, 'rgb(143, 123, 196)'],
                  [1, 'rgb(255,127,97)']],
    name='Cost Function'
)
data_3d_plot = Data([cost_surface])
layout = Layout(
    scene = dict(
        xaxis = dict(title='beta0'),
        yaxis = dict(title='beta1'),
        zaxis = dict(title='cost')
    ))
figure_3d = Figure(data=data_3d_plot, layout=layout)
iplot(figure_3d)
# SGD: LINEAR REGRESSION
# Pure stochastic gradient descent (batch_size defaults to 1) on the OLS
# objective, plotted over the visited region of the cost surface.
sgd_param_history, sgd_time_history = stochastic_gradient_descent(X, y, ols_cost_function,
                                                                  ols_gradient_of_cost_function,
                                                                  initial_guess=np.array([0., 0.]),
                                                                  learning_rate=.1)
figure_3d = plot_sgd_results(X, y, ols_cost_function, sgd_param_history)
iplot(figure_3d)
# MINIBATCH SGD: LINEAR REGRESSION
# Same optimizer, but each update uses a 5-sample mini-batch.
minibatch_param_history, minibatch_time_history = stochastic_gradient_descent(X, y, ols_cost_function,
                                                                              ols_gradient_of_cost_function,
                                                                              initial_guess=np.array([0., 0.]),
                                                                              learning_rate=.1, batch_size=5)
figure_3d = plot_sgd_results(X, y, ols_cost_function, minibatch_param_history)
iplot(figure_3d)
# PLOT GD, SGD, and minibatch SGD convergence on log scale
# (full-data-set OLS cost at each recorded parameter history entry;
# x axis = step index, y axis = log cost)
gd_convergence = Scatter(
    x = [i for i,j in enumerate(gd_param_history)],
    y = [ols_cost_function(X, y, (p[0], p[1])) for p in gd_param_history],
    mode = 'lines',
    name = 'Batch Gradient Descent'
)
sgd_convergence = Scatter(
    x = [i for i,j in enumerate(sgd_param_history)],
    y = [ols_cost_function(X, y, (p[0], p[1])) for p in sgd_param_history],
    mode = 'lines',
    name = 'SGD'
)
minibatch_convergence = Scatter(
    x = [i for i,j in enumerate(minibatch_param_history)],
    y = [ols_cost_function(X, y, (p[0], p[1])) for p in minibatch_param_history],
    mode = 'lines',
    name = 'Mini-batch SGD'
)
layout = Layout(
    xaxis=XAxis(
        range=[0,150],
        title='steps'
    ),
    yaxis=YAxis(
        type='log',
        autorange=True,
        title='cost'
    )
)
data = Data([gd_convergence, sgd_convergence, minibatch_convergence])
figure = Figure(data=data, layout=layout)
iplot(figure)
# PLOT GD, SGD, and minibatch SGD convergence against wall-clock time
# (seconds elapsed since each run's first timestamp; y axis = log cost)
gd_convergence = Scatter(
    x = [t - gd_time_history[0] for t in gd_time_history],
    y = [ols_cost_function(X, y, (p[0], p[1])) for p in gd_param_history],
    mode = 'lines',
    name = 'Batch Gradient Descent'
)
sgd_convergence = Scatter(
    x = [t - sgd_time_history[0] for t in sgd_time_history],
    y = [ols_cost_function(X, y, (p[0], p[1])) for p in sgd_param_history],
    mode = 'lines',
    name = 'SGD'
)
minibatch_convergence = Scatter(
    x = [t - minibatch_time_history[0] for t in minibatch_time_history],
    y = [ols_cost_function(X, y, (p[0], p[1])) for p in minibatch_param_history],
    mode = 'lines',
    name = 'Mini-batch SGD'
)
layout = Layout(
    xaxis=XAxis(
        range=[0,1],
        title='time'
    ),
    yaxis=YAxis(
        type='log',
        autorange=True,
    )
)
data = Data([gd_convergence, sgd_convergence, minibatch_convergence])
figure = Figure(data=data, layout=layout)
iplot(figure)